import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
from autogluon.tabular import TabularDataset, TabularPredictor
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')
# --- imports ---
# Load the eight training splits and the shared test set from CSV.
df_train1 = pd.read_csv('~/Dropbox/Data/df_train1.csv')
df_train2 = pd.read_csv('~/Dropbox/Data/df_train2.csv')
df_train3 = pd.read_csv('~/Dropbox/Data/df_train3.csv')
df_train4 = pd.read_csv('~/Dropbox/Data/df_train4.csv')
df_train5 = pd.read_csv('~/Dropbox/Data/df_train5.csv')
df_train6 = pd.read_csv('~/Dropbox/Data/df_train6.csv')
df_train7 = pd.read_csv('~/Dropbox/Data/df_train7.csv')
df_train8 = pd.read_csv('~/Dropbox/Data/df_train8.csv')
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')
# Display shape and mean of is_fraud for the first training split and the test set.
(df_train1.shape, df_train1.is_fraud.mean()), (df_test.shape, df_test.is_fraud.mean())
# Recorded notebook output of the expression above (kept verbatim):
(((734003, 22), 0.005728859418830713), ((314572, 22), 0.005725239372862174))
# Stack each training split on top of the test set.
_df1 = pd.concat([df_train1, df_test])
_df2 = pd.concat([df_train2, df_test])
_df3 = pd.concat([df_train3, df_test])
_df4 = pd.concat([df_train4, df_test])
_df5 = pd.concat([df_train5, df_test])
_df6 = pd.concat([df_train6, df_test])
_df7 = pd.concat([df_train7, df_test])
_df8 = pd.concat([df_train8, df_test])

# Mean of is_fraud over each combined (train split + test) frame.
_df1_mean = _df1.is_fraud.mean()
_df2_mean = _df2.is_fraud.mean()
_df3_mean = _df3.is_fraud.mean()
_df4_mean = _df4.is_fraud.mean()
_df5_mean = _df5.is_fraud.mean()
_df6_mean = _df6.is_fraud.mean()
_df7_mean = _df7.is_fraud.mean()
_df8_mean = _df8.is_fraud.mean()
# Fit AutoGluon using only the amt feature: train on split 4, keep df_80 as the
# evaluation subset.
# NOTE(review): df_80 is first assigned further down in this file; executed top
# to bottom as a script this cell raises NameError — confirm the intended order.
df_tr = df_train4[["amt", "is_fraud"]]
df_tst = df_80[["amt", "is_fraud"]]
tr = TabularDataset(df_tr)
tst = TabularDataset(df_tst)
predictr = TabularPredictor(label="is_fraud", verbosity=1)
t1 = time.time()  # timestamp taken immediately before fitting
predictr.fit(tr)
No path specified. Models will be saved in: "AutogluonModels/ag-20240520_100601/"
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f9f65c436d0>
# Ask the fitted predictor for a calibrated decision threshold. The return value
# is only displayed here; it is not stored or applied to the predictor.
predictr.calibrate_decision_threshold()
# Recorded notebook output of the call above (kept verbatim):
0.5
# Predicted labels for df_80; the recorded output follows below.
predictr.predict(df_80)
0 0
1 0
2 0
3 0
5 0
..
314538 0
314558 0
314563 0
314566 0
314571 0
Name: is_fraud, Length: 231011, dtype: int64
# Probability from the last predict_proba column, alongside the true labels.
yyhat_prob = predictr.predict_proba(df_80).iloc[:, -1]
yy = df_80.is_fraud

# Compact frame with the single feature, the label and the predicted probability.
_df = df_80.assign(yyhat_prob=yyhat_prob).loc[:, ['amt', 'is_fraud', 'yyhat_prob']]

# ROC AUC of the predicted probability against the true labels.
import sklearn.metrics
sklearn.metrics.roc_auc_score(yy, yyhat_prob)
# Out: 0.886678870980294
# Rows labelled is_fraud == 1: mean predicted probability, mean amount, amount histogram.
_df[_df.is_fraud == 1].yyhat_prob.mean()
# Out: 0.3329286185951976
_df[_df.is_fraud == 1].amt.mean()
# Out: 16.30444155844156
_df[_df.is_fraud == 1].amt.hist()

# Rows labelled is_fraud == 0: the same three summaries.
_df[_df.is_fraud == 0].yyhat_prob.mean()
# Out: 0.2986325147135913
_df[_df.is_fraud == 0].amt.mean()
# Out: 31.54455620788636
_df[_df.is_fraud == 0].amt.hist()
# Per-model class probabilities, called without a data argument. The recorded
# output below lists 2500 rows per model — presumably AutoGluon's internal
# validation split; TODO confirm against the predict_proba_multi documentation.
predictr.predict_proba_multi()
{'KNeighborsUnif': 0 1
13979 1.0 0.0
24323 1.0 0.0
23007 1.0 0.0
37276 1.0 0.0
31070 1.0 0.0
... ... ...
36224 1.0 0.0
34032 1.0 0.0
34720 1.0 0.0
17138 1.0 0.0
25898 1.0 0.0
[2500 rows x 2 columns],
'KNeighborsDist': 0 1
13979 1.0 0.0
24323 1.0 0.0
23007 1.0 0.0
37276 1.0 0.0
31070 1.0 0.0
... ... ...
36224 1.0 0.0
34032 1.0 0.0
34720 1.0 0.0
17138 1.0 0.0
25898 1.0 0.0
[2500 rows x 2 columns],
'ExtraTreesEntr': 0 1
13979 1.0 0.0
24323 1.0 0.0
23007 1.0 0.0
37276 1.0 0.0
31070 1.0 0.0
... ... ...
36224 1.0 0.0
34032 1.0 0.0
34720 1.0 0.0
17138 1.0 0.0
25898 1.0 0.0
[2500 rows x 2 columns],
'LightGBM': 0 1
13979 0.967662 0.032338
24323 0.965496 0.034504
23007 0.969491 0.030509
37276 0.956655 0.043345
31070 0.969491 0.030509
... ... ...
36224 0.967500 0.032500
34032 0.969491 0.030509
34720 0.969491 0.030509
17138 0.967500 0.032500
25898 0.969491 0.030509
[2500 rows x 2 columns],
'RandomForestEntr': 0 1
13979 1.0 0.0
24323 1.0 0.0
23007 1.0 0.0
37276 1.0 0.0
31070 1.0 0.0
... ... ...
36224 1.0 0.0
34032 1.0 0.0
34720 1.0 0.0
17138 1.0 0.0
25898 1.0 0.0
[2500 rows x 2 columns],
'CatBoost': 0 1
13979 0.987880 0.012120
24323 0.983165 0.016835
23007 0.990823 0.009177
37276 0.963983 0.036017
31070 0.990753 0.009247
... ... ...
36224 0.972353 0.027647
34032 0.991516 0.008484
34720 0.984103 0.015897
17138 0.982423 0.017577
25898 0.987116 0.012884
[2500 rows x 2 columns],
'LightGBMLarge': 0 1
13979 0.966235 0.033765
24323 0.970804 0.029196
23007 0.970804 0.029196
37276 0.957979 0.042021
31070 0.970804 0.029196
... ... ...
36224 0.966553 0.033447
34032 0.970804 0.029196
34720 0.970804 0.029196
17138 0.970664 0.029336
25898 0.970804 0.029196
[2500 rows x 2 columns],
'NeuralNetFastAI': 0 1
13979 0.995330 0.004670
24323 0.994297 0.005703
23007 0.996624 0.003376
37276 0.888140 0.111860
31070 0.996497 0.003503
... ... ...
36224 0.914898 0.085102
34032 0.998039 0.001961
34720 0.998416 0.001583
17138 0.953835 0.046165
25898 0.998450 0.001550
[2500 rows x 2 columns],
'LightGBMXT': 0 1
13979 0.981476 0.018524
24323 0.974396 0.025604
23007 0.988903 0.011097
37276 0.972741 0.027259
31070 0.988759 0.011241
... ... ...
36224 0.986411 0.013589
34032 0.995874 0.004126
34720 0.992955 0.007045
17138 0.993169 0.006831
25898 0.994273 0.005727
[2500 rows x 2 columns],
'XGBoost': 0 1
13979 0.549148 0.450852
24323 0.549148 0.450852
23007 0.549148 0.450852
37276 0.549276 0.450724
31070 0.549148 0.450852
... ... ...
36224 0.549276 0.450724
34032 0.549148 0.450852
34720 0.549148 0.450852
17138 0.549276 0.450724
25898 0.549148 0.450852
[2500 rows x 2 columns],
'NeuralNetTorch': 0 1
13979 0.998562 0.001438
24323 0.995813 0.004187
23007 0.999538 0.000462
37276 0.979087 0.020913
31070 0.999517 0.000483
... ... ...
36224 0.993657 0.006343
34032 0.999578 0.000422
34720 0.997371 0.002629
17138 0.998840 0.001160
25898 0.997434 0.002566
[2500 rows x 2 columns],
'ExtraTreesGini': 0 1
13979 1.0 0.0
24323 1.0 0.0
23007 1.0 0.0
37276 1.0 0.0
31070 1.0 0.0
... ... ...
36224 1.0 0.0
34032 1.0 0.0
34720 1.0 0.0
17138 1.0 0.0
25898 1.0 0.0
[2500 rows x 2 columns],
'RandomForestGini': 0 1
13979 1.0 0.0
24323 1.0 0.0
23007 1.0 0.0
37276 1.0 0.0
31070 1.0 0.0
... ... ...
36224 1.0 0.0
34032 1.0 0.0
34720 1.0 0.0
17138 1.0 0.0
25898 1.0 0.0
[2500 rows x 2 columns],
'WeightedEnsemble_L2': 0 1
13979 0.711494 0.288506
24323 0.710591 0.289409
23007 0.711927 0.288073
37276 0.705633 0.294367
31070 0.711921 0.288079
... ... ...
36224 0.710353 0.289647
34032 0.711936 0.288064
34720 0.711394 0.288606
17138 0.711626 0.288374
25898 0.711410 0.288590
[2500 rows x 2 columns]}
def auto_amt_ver0503(df_tr, df_tst, _df_mean):
    """Fit AutoGluon on the ``amt`` feature alone and tabulate per-model test metrics.

    Only the ``amt`` and ``is_fraud`` columns of both frames are used. A
    ``TabularPredictor`` is fitted on the training subset, every model in the
    trainer's model graph is evaluated on the test subset, and one result row
    per model is collected. The table is written to
    ``../results2/<YYYYmmdd-HHMMSS>-Autogluon.csv`` and also returned.

    Args:
        df_tr: Training frame; must contain ``amt`` and ``is_fraud`` columns.
        df_tst: Test frame; must contain ``amt`` and ``is_fraud`` columns.
        _df_mean: Value stored unchanged in the ``throw_rate`` column of
            every row.

    Returns:
        pandas.DataFrame with one row per model and the columns ``model``,
        ``time``, ``acc``, ``pre``, ``rec``, ``f1``, ``auc``, ``graph_based``,
        ``method``, ``throw_rate``, ``train_size``, ``train_cols``,
        ``train_frate``, ``test_size``, ``test_frate``, ``hyper_params``.
    """
    result_columns = [
        'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc',
        'graph_based', 'method', 'throw_rate',
        'train_size', 'train_cols', 'train_frate',
        'test_size', 'test_frate', 'hyper_params',
    ]

    used_cols = ["amt", "is_fraud"]
    tr = TabularDataset(df_tr[used_cols])
    tst = TabularDataset(df_tst[used_cols])

    predictr = TabularPredictor(label="is_fraud", verbosity=1)
    predictr.fit(tr)

    # Values that are the same for every model row; computed once up front.
    feature_cols = [col for col in tr.columns if col != 'is_fraud']
    train_size = len(tr)
    train_frate = tr.is_fraud.mean()
    test_size = len(tst)
    test_frate = tst.is_fraud.mean()

    rows = []
    # NOTE(review): _trainer is a private AutoGluon attribute; confirm whether a
    # public accessor for the model names is available in the installed version.
    for model_name in predictr._trainer.model_graph.nodes:
        # Evaluate this individual model on the test subset.
        eval_result = predictr.evaluate(tst, model=model_name)

        # Collect the metrics and run metadata as one result row.
        rows.append({
            'model': model_name,
            # Per-model training time was wanted here but is not available
            # yet, so the column is left empty.
            'time': None,
            'acc': eval_result['accuracy'],
            'pre': eval_result['precision'],
            'rec': eval_result['recall'],
            'f1': eval_result['f1'],
            'auc': eval_result['roc_auc'],
            'graph_based': False,
            'method': 'Autogluon',
            'throw_rate': _df_mean,
            'train_size': train_size,
            'train_cols': list(feature_cols),  # separate list object per row
            'train_frate': train_frate,
            'test_size': test_size,
            'test_frate': test_frate,
            'hyper_params': None,
        })

    # Explicit columns keep the schema and order stable even with zero models.
    df_results = pd.DataFrame(rows, columns=result_columns)

    ymdhms = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'../results2/{ymdhms}-Autogluon.csv', index=False)
    return df_results
# Check whether transactions with amt under 80 are caught well (amt 80 미만 잘 잡는지 확인용).
# Test rows whose amt is at most 80 (the comparison is inclusive of 80).
df_80 = df_test[df_test['amt'] <= 80]
df_80.shape, df_80.is_fraud.mean()
# Out: ((231011, 22), 0.0016665873053664112)
# Stack each training split on top of the amt <= 80 test subset.
_df1_ = pd.concat([df_train1, df_80])
_df2_ = pd.concat([df_train2, df_80])
_df3_ = pd.concat([df_train3, df_80])
_df4_ = pd.concat([df_train4, df_80])
_df5_ = pd.concat([df_train5, df_80])
_df6_ = pd.concat([df_train6, df_80])
_df7_ = pd.concat([df_train7, df_80])
_df8_ = pd.concat([df_train8, df_80])

# Mean of is_fraud over each combined (train split + df_80) frame.
_df1_mean_ = _df1_.is_fraud.mean()
_df2_mean_ = _df2_.is_fraud.mean()
_df3_mean_ = _df3_.is_fraud.mean()
_df4_mean_ = _df4_.is_fraud.mean()
_df5_mean_ = _df5_.is_fraud.mean()
_df6_mean_ = _df6_.is_fraud.mean()
_df7_mean_ = _df7_.is_fraud.mean()
_df8_mean_ = _df8_.is_fraud.mean()